Your client is a mental health expert from an NGO who is interested in understanding more about gaming and the potentially addictive effect it can have on some individuals. You are meeting the client in a few days and they would like you to extract and present insights from the Steam dataset to help them in their research.
Please use whichever tools you feel the most comfortable with, but we do recommend Tableau which is a popular choice. Tableau is free for students and there is also a free trial available here.
# Make sure script changes take effect within this session
%load_ext autoreload
%autoreload 2
# import some useful packages for this analysis, start spark session
from setup import *
spark
Open localhost:4040 to monitor the spark UI
dfs = ['Player_Summaries', 'Games_Publishers', 'Games_Genres', 'Games_Developers', 'Games_1', 'Games_2', 'Friends', 'Groups', 'App_ID']
spark_handler = spark_df_handler()
print('Number of records \n')
for each in dfs:
spark_handler.load(each)
print('{0} : {1}'.format(each, str(spark_handler.dfraw[each].count())))
# Create timestamp vars
player_summaries = spark_handler.dfraw[dfs[0]]
player_summaries = player_summaries.withColumn('datetimecreated', F.to_timestamp(player_summaries.timecreated,'yyyy-MM-dd HH:mm:ss'))
player = player_summaries.select('steamid', 'primaryclanid', 'datetimecreated')
player_signup = player.withColumn('year_created', F.date_trunc('year', player_summaries['datetimecreated']))
player_signup = player_signup.withColumn('month_created', F.date_trunc('month', player_signup['datetimecreated']))
player_signup = player_signup.withColumn('day_created', F.date_trunc('day', player_signup['datetimecreated']))
player_signup = player_signup.withColumn('dayofweek_created', F.dayofweek('datetimecreated').cast('byte'))
player_signup = player_signup.withColumn('dayhour_created', F.date_trunc('hour', player_signup['datetimecreated']))
player_signup = player_signup.withColumn('hour_created', F.hour('datetimecreated').cast('byte'))
# concatenate game tables
games = spark_handler.dfraw[dfs[4]] #.union(spark_handler.dfraw[dfs[5]])
app_info = spark_handler.dfraw[dfs[8]]
games = games.join(app_info, 'appid', how = 'left')
player_stats = games.groupby('steamid').agg(F.sum('playtime_2weeks').alias('playtime_2weeks_player'),
F.sum('playtime_forever').alias('playtime_forever_player'),
F.countDistinct('appid').alias('game_count_player'),
F.sum('Is_Multiplayer').alias('multiplayer_count_player'),
F.sum('Price').alias('spending_player'))
player_stats = player_stats.fillna(0)
player_stats = player_stats.withColumn('playtime_average_player', F.col('playtime_2weeks_player') / (14 * 60))
player_stats = player_stats.drop('playtime_2weeks_player')
player_stats = player_stats.withColumn('multiplayer_fraction_player', F.col('multiplayer_count_player') / F.col('game_count_player'))
# player_stats_pd = player_stats.toPandas()
# player_stats_pd.to_csv(os.path.join(path, '../player_stats.csv'))
# player_stats_pd = pd.read_csv(os.path.join(path, '../player_stats.csv'), index_col = False)
groups = spark_handler.dfraw[dfs[7]]
group_stats = groups.groupby('steamid').agg(F.countDistinct('groupid').alias('group_count_player'))
friends = spark_handler.dfraw[dfs[6]]
friends = friends.withColumn('friend_since', F.to_timestamp(friends.friend_since,'yyyy-MM-dd HH:mm:ss'))
friends = friends.withColumn('friend_since_day', F.date_trunc('day', friends['friend_since']))
friends = friends.withColumn('friend_since_month', F.date_trunc('month', friends['friend_since']))
friends_stats = friends.join(player_stats, friends.steamid_b == player_stats.steamid, how = 'left')
agg_friends_stats = friends_stats.groupby('steamid_a').agg(F.countDistinct('steamid_b').alias('friend_count'),
F.mean('playtime_average_player').alias('playtime_average_friends'),
F.max('playtime_average_player').alias('playtime_average_friends_max'),
F.mean('playtime_forever_player').alias('playtime_forever_friends'),
F.max('playtime_forever_player').alias('playtime_forever_friends_max'),
F.mean('game_count_player').alias('game_count_friends'),
F.mean('multiplayer_count_player').alias('multiplayer_count_friends'),
F.mean('multiplayer_fraction_player').alias('multiplayer_fraction_friends'),
F.mean('spending_player').alias('spending_friends'))
user_summary = player_stats.join(player, 'steamid', how = 'left')
user_summary = user_summary.join(group_stats, 'steamid', how = 'left')
user_summary = user_summary.join(agg_friends_stats, user_summary.steamid == agg_friends_stats.steamid_a, how = 'left')
user_summary_pd = user_summary.toPandas()
user_summary_pd.to_csv(os.path.join(path, '../user_summary.csv'))
user_summary_pd = pd.read_csv(os.path.join(path, '../user_summary.csv'), index_col = False)
user_summary_pd.describe()
player_location = player_summaries.groupby('loccountrycode').count().toPandas()
world_borders = gpd.read_file(os.path.join(path, '../geo_files/TM_WORLD_BORDERS-0.3.shp'))
map_df = world_borders.merge(player_location, how = 'left', left_on = 'ISO2', right_on = 'loccountrycode')
map_df['player_density'] = map_df['count'] / map_df.AREA
map_df['player_per_cap'] = map_df['count'] / map_df.POP2005 * 100
map_df = map_df.replace(np.inf, 0).fillna(0)
m = folium.Map(location=[50, 0],
tiles='CartoDB positron',
zoom_start=1.5,
control_scale = True)
colorscale_t = branca.colormap.linear.YlOrRd_09.scale(0, 100000)
colorscale_t.caption = '# of players registered'
folium.GeoJson(map_df, name='players per country',
style_function=lambda x: {"weight": 0.2, 'color': 'black', 'fillColor':colorscale_t(x['properties']['count']), 'fillOpacity':0.9},
highlight_function=lambda x: {'weight':1, 'color':'black'},
smooth_factor=1.0,
show = True,
tooltip=folium.features.GeoJsonTooltip(fields = ['NAME','count', 'player_density', 'player_per_cap'],
aliases=['Country','Number of Players', 'Players / 10 sqkm', 'Players / 100 people' ],
labels=True,
sticky=True,
localize=True
)).add_to(m)
folium.LayerControl().add_to(m)
colorscale_t.add_to(m, 'scale')
m.save(os.path.join(git, '../html_maps/players_per_country.html'))
m